/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020 Joyent, Inc.
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 * Copyright 2024 Oxide Computer Company
 */

/*
 * This file drives topo node enumeration of NVMe controllers.  A single "nvme"
 * node is enumerated for each NVMe controller.   Child "disk" nodes are then
 * enumerated for each active or attached NVMe namespace.
 *
 * nvme nodes are expected to be enumerated under either a "bay" node (for U.2
 * devices) or a "slot" node (for M.2 devices) or a "pciexfn" node (for AIC
 * devices).
 *
 * Enumeration of NVMe controllers on PCIe add-in cards is automatically driven
 * by the pcibus topo module.
 *
 * In order to allow for associating a given NVMe controller with a physical
 * location, enumeration of U.2 and M.2 devices should be driven by a
 * platform-specific topo map which statically sets the following two
 * properties on the parent "bay" or "slot" node:
 *
 * propgroup        property        description
 * ---------        --------        ------------
 * binding          driver          "nvme"
 * binding          parent-device   devpath of parent PCIe device
 *
 * for example:
 *
 * <propgroup name="binding" version="1" name-stability="Private"
 *   data-stability="Private">
 *     <propval name="driver" type="string" value="nvme"/>
 *     <propval name="parent-device" type="string"
 *       value="/pci@0,0/pci8086,6f09@3,1"/>
 * </propgroup>
 * <dependents grouping="children">
 *     <range name="nvme" min="0" max="0">
 *         <enum-method name="disk" version="1"/>
 *     </range>
 * </dependents>
 */
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <strings.h>
#include <stdbool.h>

#include <sys/fm/protocol.h>
#include <fm/topo_hc.h>
#include <fm/topo_mod.h>
#include <topo_ufm.h>

#include <sys/dkio.h>
#include <sys/scsi/generic/inquiry.h>

#include <libnvme.h>
#include "disk.h"
#include "disk_drivers.h"

/*
 * Per-controller enumeration state.  discover_nvme_ctl() fills this in for a
 * single NVMe controller and hands it to make_nvme_node(), which in turn
 * passes it to disk_nvme_make_ns() for every namespace it visits.
 */
typedef struct nvme_enum_info {
	/* topo module handle used for logging and allocation */
	topo_mod_t		*nei_mod;
	/* devinfo node of the NVMe controller being enumerated */
	di_node_t		nei_dinode;
	/* libnvme handle taken from the module-specific topo_disk_t */
	nvme_t			*nei_libnvme;
	/* controller handle obtained from nvme_ctrl_init() */
	nvme_ctrl_t		*nei_ctrl;
	/* controller information snapshot from nvme_ctrl_info_snap() */
	nvme_ctrl_info_t	*nei_ctrl_info;
	/* NVMe version as returned by nvme_ctrl_info_version() */
	const nvme_version_t	*nei_vers;
	/* topo node the "nvme" node is created under */
	tnode_t			*nei_parent;
	/* the "nvme" topo node once make_nvme_node() has bound it */
	tnode_t			*nei_nvme;
	/* FMRI the "nvme" node was bound with; set by make_nvme_node() */
	nvlist_t		*nei_nvme_fmri;
	/* NOTE(review): not referenced by any code visible in this file */
	int			nei_fd;
} nvme_enum_info_t;

/*
 * Argument passed through di_devlink_walk() to devlink_cb().  The caller sets
 * dla_mod; the callback fills in the remaining members.
 */
typedef struct devlink_arg {
	/* topo module handle used to allocate dla_logical_disk */
	topo_mod_t		*dla_mod;
	/* topo_mod_strdup()'d devlink path with the slice suffix trimmed */
	char			*dla_logical_disk;
	/* size of the dla_logical_disk allocation, taken before trimming */
	uint_t			dla_strsz;
} devlink_arg_t;

/*
 * di_devlink_walk() callback.  Duplicates the path of the devlink it is handed
 * into the devlink_arg_t, records the size of that allocation and then cuts
 * the slice suffix off of the final path component.  The walk is always
 * terminated after the first link.
 */
static int
devlink_cb(di_devlink_t dl, void *arg)
{
	devlink_arg_t *state = arg;
	const char *linkpath = di_devlink_path(dl);
	char *copy, *lastcomp, *slicep;

	if (linkpath == NULL)
		return (DI_WALK_TERMINATE);

	copy = topo_mod_strdup(state->dla_mod, linkpath);
	state->dla_logical_disk = copy;
	if (copy == NULL)
		return (DI_WALK_TERMINATE);

	/*
	 * Remember how large the allocation is before we shorten the string
	 * below.  The caller has to hand the original size back when freeing
	 * it, otherwise libumem will complain.
	 */
	state->dla_strsz = strlen(copy) + 1;

	/*
	 * Drop the slice from the public name: find the last path component
	 * and terminate the string at the first 's' within it.
	 */
	lastcomp = strrchr(copy, '/');
	if (lastcomp != NULL) {
		slicep = strchr(lastcomp, 's');
		if (slicep != NULL)
			*slicep = '\0';
	}

	return (DI_WALK_TERMINATE);
}

/*
 * Look up the primary /dev/dsk link for the ":a" minor node of 'devpath' and
 * return its path with the slice suffix removed.
 *
 * Returns NULL if the minor path cannot be built or the devlink database
 * cannot be opened.  Otherwise returns whatever devlink_cb() recorded, which
 * is NULL when no link matched.  On the non-error paths *bufsz receives the
 * allocation size the caller must pass to topo_mod_free().
 */
static char *
get_logical_disk(topo_mod_t *mod, const char *devpath, uint_t *bufsz)
{
	devlink_arg_t walk_state = { .dla_mod = mod };
	di_devlink_handle_t hdl;
	char *minor = NULL;

	if (asprintf(&minor, "%s:a", devpath) < 0)
		return (NULL);

	/*
	 * NOTE(review): the handle is compared against DI_NODE_NIL here;
	 * confirm against di_devlink_init(3DEVINFO) whether DI_LINK_NIL is
	 * the documented failure value.
	 */
	hdl = di_devlink_init(NULL, 0);
	if (hdl == DI_NODE_NIL) {
		topo_mod_dprintf(mod, "%s: di_devlink_init failed", __func__);
		free(minor);
		return (NULL);
	}

	(void) di_devlink_walk(hdl, "^dsk/", minor, DI_PRIMARY_LINK,
	    &walk_state, devlink_cb);
	(void) di_devlink_fini(&hdl);

	free(minor);

	*bufsz = walk_state.dla_strsz;
	return (walk_state.dla_logical_disk);
}

/*
 * Format a serial number for a namespace into 'buf'.  The NGUID is used if
 * the namespace has one, then the EUI64, and finally the decimal namespace
 * ID as a last resort.
 *
 * Returns true if the serial was written in full, false if formatting failed
 * or the result did not fit in 'buflen' bytes.
 */
static bool
disk_nvme_make_ns_serial(topo_mod_t *mod, nvme_ns_info_t *ns_info, char *buf,
    size_t buflen)
{
	uint8_t nguid[16], eui64[8];
	const uint32_t nsid = nvme_ns_info_nsid(ns_info);
	int ret;

	if (nvme_ns_info_nguid(ns_info, nguid)) {
		ret = snprintf(buf, buflen, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
		    nguid[0], nguid[1], nguid[2], nguid[3], nguid[4],
		    nguid[5], nguid[6], nguid[7], nguid[8], nguid[9],
		    nguid[10], nguid[11], nguid[12], nguid[13], nguid[14],
		    nguid[15]);
	} else if (nvme_ns_info_eui64(ns_info, eui64)) {
		ret = snprintf(buf, buflen,
		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
		    eui64[0], eui64[1], eui64[2], eui64[3], eui64[4],
		    eui64[5], eui64[6], eui64[7]);
	} else {
		ret = snprintf(buf, buflen, "%u", nsid);
	}

	/*
	 * A negative return is an output error rather than truncation, so
	 * report it separately instead of as a bogus byte count.
	 */
	if (ret < 0) {
		topo_mod_dprintf(mod, "failed to format serial number for "
		    "nsid %u", nsid);
		return (false);
	}

	/*
	 * snprintf() returns the length without the terminating NUL, so the
	 * space that was required is one more than that.
	 */
	if ((size_t)ret >= buflen) {
		topo_mod_dprintf(mod, "overflowed serial number for nsid %u: "
		    "needed %d bytes, got %zu", nsid, ret + 1, buflen);
		return (false);
	}

	return (true);
}

/*
 * Create the common I/O property group properties that are shared between
 * controllers and namespaces. We assume the property group was already created.
 *
 * Each property is derived from the devinfo node 'di' and is only set when
 * the underlying value is available: the instance when it is not -1, the
 * driver name and module FMRI when the node has a driver, and the devfs and
 * physical paths when di_devfs_path() succeeds.
 *
 * Returns true if every property that was attempted was set, false as soon as
 * one of them fails.
 */
static bool
disk_nvme_common_io(topo_mod_t *mod, tnode_t *tn, di_node_t di)
{
	int err;
	int inst = di_instance(di);
	const char *drv = di_driver_name(di);
	char *path;
	const char *ppaths[1];
	bool ret = false;

	if (inst != -1 && topo_prop_set_uint32(tn, TOPO_PGROUP_IO,
	    TOPO_IO_INSTANCE, TOPO_PROP_IMMUTABLE, (uint32_t)inst, &err) != 0) {
		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
		    "%s", TOPO_PGROUP_IO, TOPO_IO_INSTANCE, topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
		return (false);
	}

	if (drv != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
	    TOPO_IO_DRIVER, TOPO_PROP_IMMUTABLE, drv, &err) != 0) {
		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
		return (false);
	}

	if (drv != NULL) {
		nvlist_t *fmri = topo_mod_modfmri(mod, FM_MOD_SCHEME_VERSION,
		    drv);

		/*
		 * Like the other optional values here, a module FMRI that
		 * could not be constructed just means the property is left
		 * out. It must not be handed to topo_prop_set_fmri() as NULL.
		 */
		if (fmri == NULL) {
			topo_mod_dprintf(mod, "failed to create module FMRI "
			    "for driver %s, not setting %s:%s on %s[%" PRIu64
			    "]", drv, TOPO_PGROUP_IO, TOPO_IO_MODULE,
			    topo_node_name(tn), topo_node_instance(tn));
		} else {
			int rc = topo_prop_set_fmri(tn, TOPO_PGROUP_IO,
			    TOPO_IO_MODULE, TOPO_PROP_IMMUTABLE, fmri, &err);

			nvlist_free(fmri);
			if (rc != 0) {
				topo_mod_dprintf(mod, "failed to set %s:%s on "
				    "%s[%" PRIu64 "]: %s", TOPO_PGROUP_IO,
				    TOPO_IO_MODULE, topo_node_name(tn),
				    topo_node_instance(tn),
				    topo_strerror(err));
				return (false);
			}
		}
	}

	/*
	 * Without a devfs path there is nothing more to set. The path is
	 * owned by us from here on and released at 'out'.
	 */
	path = di_devfs_path(di);
	if (path == NULL) {
		return (true);
	}
	ppaths[0] = path;

	if (topo_prop_set_string(tn, TOPO_PGROUP_IO, TOPO_IO_DEV_PATH,
	    TOPO_PROP_IMMUTABLE, path, &err) != 0) {
		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
		    "%s", TOPO_PGROUP_IO, TOPO_IO_DEV_PATH, topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
		goto out;
	}

	if (topo_prop_set_string_array(tn, TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH,
	    TOPO_PROP_IMMUTABLE, ppaths, 1, &err) != 0) {
		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
		    "%s", TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH, topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
		goto out;
	}

	ret = true;
out:
	di_devfs_path_free(path);
	return (ret);
}

/*
 * Fill in the storage and I/O properties of a namespace's disk node that can
 * only be determined once a devinfo node for it has been found.  The caller
 * has already created the storage property group; the I/O property group is
 * created here.  Failures are logged and end the function early, they are not
 * reported to the caller.
 */
static void
disk_nvme_make_ns_di_props(topo_mod_t *mod, tnode_t *tn, di_node_t di)
{
	char *devid = NULL, *vendor = NULL, *product = NULL;
	char *revision = NULL, *serialno = NULL;
	char *devfs, *link, *slash;
	uint_t linksz;
	int err;

	/*
	 * Every one of these devinfo properties must be present with exactly
	 * one string value.  They are looked up in order and we stop at the
	 * first one that is not.
	 */
	const struct {
		const char *name;
		char **valp;
	} wanted[] = {
		{ DEVID_PROP_NAME, &devid },
		{ INQUIRY_VENDOR_ID, &vendor },
		{ INQUIRY_PRODUCT_ID, &product },
		{ INQUIRY_REVISION_ID, &revision },
		{ INQUIRY_SERIAL_NO, &serialno }
	};

	for (size_t i = 0; i < sizeof (wanted) / sizeof (wanted[0]); i++) {
		if (di_prop_lookup_strings(DDI_DEV_T_ANY, di, wanted[i].name,
		    wanted[i].valp) == 1) {
			continue;
		}

		topo_mod_dprintf(mod, "failed to get devinfo props for %s[%"
		    PRIu64 "]", topo_node_name(tn), topo_node_instance(tn));
		return;
	}

	/*
	 * The storage property group describes the device, so what we record
	 * here is the controller's identity rather than anything specific to
	 * the namespace.
	 */
	const struct {
		const char *prop;
		const char *val;
	} storage[] = {
		{ TOPO_STORAGE_MANUFACTURER, vendor },
		{ TOPO_STORAGE_SERIAL_NUM, serialno },
		{ TOPO_STORAGE_FIRMWARE_REV, revision },
		{ TOPO_STORAGE_MODEL, product }
	};

	for (size_t i = 0; i < sizeof (storage) / sizeof (storage[0]); i++) {
		if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
		    storage[i].prop, TOPO_PROP_IMMUTABLE, storage[i].val,
		    &err) == 0) {
			continue;
		}

		topo_mod_dprintf(mod, "failed to set storage properties on "
		    "%s[%" PRIu64 "]: %s", topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
		return;
	}

	/* A failure to create the group is logged but we carry on. */
	if (topo_pgroup_create(tn, &io_pgroup, &err) != 0) {
		topo_mod_dprintf(mod, "failed to create I/O property "
		    "group on %s[%" PRIu64 "]: %s", topo_node_name(tn),
		    topo_node_instance(tn), topo_strerror(err));
	}

	if (!disk_nvme_common_io(mod, tn, di))
		return;

	/*
	 * Lastly, try to tie the namespace back to its logical disk in /dev.
	 * The name we want is the final component of the link path, i.e. the
	 * cXtXdX value that follows the last '/'.
	 */
	devfs = di_devfs_path(di);
	if (devfs == NULL)
		return;

	link = get_logical_disk(mod, devfs, &linksz);
	di_devfs_path_free(devfs);
	if (link == NULL)
		return;

	slash = strrchr(link, '/');
	if (slash != NULL && slash[1] != '\0') {
		if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
		    TOPO_STORAGE_LOGICAL_DISK_NAME, TOPO_PROP_IMMUTABLE,
		    slash + 1, &err) != 0) {
			topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
			    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
			    TOPO_STORAGE_LOGICAL_DISK_NAME, topo_node_name(tn),
			    topo_node_instance(tn), topo_strerror(err));
		}
	}

	topo_mod_free(mod, link, linksz);
}

/*
 * Create a "disk" topo node under the controller's "nvme" node for a single
 * namespace and populate its storage properties.  The disk instance is the
 * namespace ID minus one.
 *
 * The block size and capacity always come from the namespace information
 * snapshot.  If the namespace has a blkdev address, the matching child
 * devinfo node of the controller is used to fill in the remaining storage and
 * I/O properties.
 *
 * Nothing is returned: failures are logged and leave the node (if it was
 * bound) with whatever properties had been set up to that point.
 */
static void
disk_nvme_make_ns(nvme_enum_info_t *nei, nvme_ns_info_t *ns_info)
{
	topo_mod_t *mod = nei->nei_mod;
	nvlist_t *auth = NULL, *fmri = NULL;
	const uint32_t nsid = nvme_ns_info_nsid(ns_info);
	const topo_instance_t inst = nsid - 1;
	char serial[64], capstr[64];
	const nvme_nvm_lba_fmt_t *fmt;
	const char *bd_addr;
	uint64_t cap, blksz, capblks;
	tnode_t *tn;
	int err, len;

	auth = topo_mod_auth(mod, nei->nei_nvme);
	if (auth == NULL) {
		topo_mod_dprintf(mod, "failed to get auth for nsid %u from "
		    "parent %s[%" PRIu64 "]: %s", nsid,
		    topo_node_name(nei->nei_nvme),
		    topo_node_instance(nei->nei_nvme), topo_mod_errmsg(mod));
		goto done;
	}

	/*
	 * We want to construct the FMRI for the namespace. The namespace is a
	 * little awkward in terms of things like the model, revision, and
	 * serial. While blkdev sets up standard inquiry properties to map these
	 * to the parent device which makes sense in the context of trying to
	 * use this as a normal block device, it's not really appropriate here.
	 * The namespace is not the NVMe controller. We construct the namespace
	 * serial number from the preferential ordering of information that
	 * we're given of the NGUID, EUI64, and then fall back to the namespace
	 * number.
	 */
	if (!disk_nvme_make_ns_serial(mod, ns_info, serial, sizeof (serial))) {
		goto done;
	}
	fmri = topo_mod_hcfmri(mod, nei->nei_nvme, FM_HC_SCHEME_VERSION,
	    DISK, inst, NULL, auth, NULL, NULL, serial);
	if (fmri == NULL) {
		topo_mod_dprintf(mod, "failed to make fmri for %s[%" PRIu64
		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
		goto done;
	}

	tn = topo_node_bind(mod, nei->nei_nvme, DISK, inst, fmri);
	if (tn == NULL) {
		topo_mod_dprintf(mod, "failed to bind fmri for %s[%" PRIu64
		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
		goto done;
	}

	/*
	 * Always inherit our parent's FRU. The namespace is just a part of the
	 * device in reality.
	 */
	if (topo_node_fru_set(tn, NULL, 0, &err) != 0) {
		topo_mod_dprintf(mod, "failed to set FRU for %s[%" PRIu64
		    "] on nsid %u: %s", DISK, inst, nsid, topo_strerror(err));
		goto done;
	}

	/*
	 * Our namespace may or may not be attached. From the namespace we will
	 * always get the capacity and block information. The rest of it will
	 * end up being filled in if we find a devinfo node.
	 */
	if (topo_pgroup_create(tn, &storage_pgroup, &err) != 0) {
		topo_mod_dprintf(mod, "failed to create storage property "
		    "group on %s[%" PRIu64 "]: %s", DISK, inst,
		    topo_strerror(err));
	}

	if (!nvme_ns_info_curformat(ns_info, &fmt)) {
		topo_mod_dprintf(mod, "failed to get current namespace "
		    "format: %s", nvme_ns_info_errmsg(ns_info));
		goto done;
	}

	blksz = nvme_nvm_lba_fmt_data_size(fmt);
	if (topo_prop_set_uint64(tn, TOPO_PGROUP_STORAGE,
	    TOPO_STORAGE_LOG_BLOCK_SIZE, TOPO_PROP_IMMUTABLE, blksz, &err) !=
	    0) {
		topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%"
		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
		    TOPO_STORAGE_LOG_BLOCK_SIZE, DISK, inst,
		    topo_strerror(err));
		goto done;
	}

	if (!nvme_ns_info_cap(ns_info, &capblks)) {
		topo_mod_dprintf(mod, "failed to get namespace capacity: %s",
		    nvme_ns_info_errmsg(ns_info));
		goto done;
	}

	/*
	 * The capacity is recorded as a string holding the size in bytes,
	 * i.e. the number of blocks times the current block size.
	 */
	cap = blksz * capblks;
	len = snprintf(capstr, sizeof (capstr), "%" PRIu64, cap);
	if (len < 0 || (size_t)len >= sizeof (capstr)) {
		topo_mod_dprintf(mod, "overflowed capacity calculation on "
		    "nsid %u", nsid);
		goto done;
	}

	if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
	    TOPO_STORAGE_CAPACITY, TOPO_PROP_IMMUTABLE, capstr, &err) != 0) {
		topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%"
		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
		    TOPO_STORAGE_CAPACITY, DISK, inst, topo_strerror(err));
		goto done;
	}

	/*
	 * Finally attempt to find a child node that has a matching name and go
	 * from there. Sorry, this does result in node creation being O(n^2),
	 * but at least n is usually small today. Note, we may not have a blkdev
	 * address because the disk may not be attached.
	 */
	if (!nvme_ns_info_bd_addr(ns_info, &bd_addr)) {
		if (nvme_ns_info_err(ns_info) != NVME_INFO_ERR_NS_NO_BLKDEV) {
			topo_mod_dprintf(mod, "failed to get namespace blkdev "
			    "address: %s", nvme_ns_info_errmsg(ns_info));
		}
		goto done;
	}

	for (di_node_t di = di_child_node(nei->nei_dinode); di != DI_NODE_NIL;
	    di = di_sibling_node(di)) {
		const char *addr = di_bus_addr(di);
		if (addr != NULL && strcmp(addr, bd_addr) == 0) {
			disk_nvme_make_ns_di_props(mod, tn, di);
		}
	}

done:
	nvlist_free(auth);
	nvlist_free(fmri);
}

/*
 * Best-effort creation of a ufm node beneath the nvme node, keyed off of the
 * controller's devinfo path.  Any failure here is deliberately ignored so
 * that the rest of the disk information can still be gathered.
 */
static void
disk_nvme_make_ufm(topo_mod_t *mod, nvme_enum_info_t *nei)
{
	topo_ufm_devinfo_t ufm_arg;
	char *devfs = di_devfs_path(nei->nei_dinode);

	if (devfs == NULL)
		return;

	ufm_arg.tud_method = TOPO_UFM_M_DEVINFO;
	ufm_arg.tud_path = devfs;

	if (topo_mod_load(mod, TOPO_MOD_UFM, TOPO_VERSION) != NULL) {
		(void) topo_mod_enumerate(mod, nei->nei_nvme, TOPO_MOD_UFM,
		    UFM, 0, 0, &ufm_arg);
	} else {
		topo_mod_dprintf(mod, "disk enum could not load ufm module");
	}

	di_devfs_path_free(devfs);
}

/*
 * Descriptor for the TOPO_PGROUP_NVME property group that make_nvme_node()
 * creates on every nvme node.
 *
 * NOTE(review): the initializer is positional; the members are presumably the
 * group name, name stability, data stability and version, in that order.
 * Confirm against the topo_pgroup_info_t definition.
 */
static const topo_pgroup_info_t nvme_pgroup = {
	TOPO_PGROUP_NVME,
	TOPO_STABILITY_PRIVATE,
	TOPO_STABILITY_PRIVATE,
	1
};

/*
 * Create the "nvme" topo node for a single controller beneath
 * nvme_info->nei_parent, set its FRU, label, UFM, NVMe and I/O properties,
 * and then create a child "disk" node for every namespace that namespace
 * discovery reports.
 *
 * On success nvme_info->nei_nvme refers to the new node.  The FMRI built here
 * is freed before returning, so nvme_info->nei_nvme_fmri is only valid while
 * this function is running and is reset to NULL on the way out.
 *
 * Returns 0 on success and -1 on failure.
 */
static int
make_nvme_node(nvme_enum_info_t *nvme_info)
{
	topo_mod_t *mod = nvme_info->nei_mod;
	nvme_ctrl_info_t *info = nvme_info->nei_ctrl_info;
	nvme_ns_iter_t *iter = NULL;
	nvme_iter_t nret;
	const nvme_ns_disc_t *disc;
	nvlist_t *auth = NULL, *fmri = NULL, *fru;
	tnode_t *nvme;
	char *model = NULL, *serial = NULL, *vers = NULL;
	char *pname = topo_node_name(nvme_info->nei_parent);
	char *label = NULL;
	topo_instance_t pinst = topo_node_instance(nvme_info->nei_parent);
	int err = 0, ret = -1;

	/*
	 * Pass the model and serial strings through a function that sanitizes
	 * them of any characters that can't be used in an FMRI string. Note, we
	 * do not use the firmware revision here because that's not really a
	 * device property that should be part of the FMRI (it can be changed at
	 * runtime).
	 */
	model = topo_mod_clean_str(mod, nvme_ctrl_info_model(info));
	serial = topo_mod_clean_str(mod, nvme_ctrl_info_serial(info));

	auth = topo_mod_auth(mod, nvme_info->nei_parent);
	fmri = topo_mod_hcfmri(mod, nvme_info->nei_parent, FM_HC_SCHEME_VERSION,
	    NVME, 0, NULL, auth, model, NULL, serial);

	if (fmri == NULL) {
		/* errno set */
		topo_mod_dprintf(mod, "%s: hcfmri failed for %s=%" PRIu64
		    "/%s=0", __func__, pname, pinst, NVME);
		goto error;
	}

	/*
	 * If our parent is a pciexfn node, then we need to create a nvme range
	 * underneath it to hold the nvme hierarchy.  For other cases, where
	 * enumeration is being driven by a topo map file, this range will have
	 * already been statically defined in the XML.
	 */
	if (strcmp(pname, PCIEX_FUNCTION) == 0) {
		if (topo_node_range_create(mod, nvme_info->nei_parent, NVME, 0,
		    0) < 0) {
			/* errno set */
			topo_mod_dprintf(mod, "%s: error creating %s range",
			    __func__, NVME);
			goto error;
		}
	}

	/*
	 * Create a new topo node to represent the NVMe controller and bind it
	 * to the parent node.
	 */
	if ((nvme = topo_node_bind(mod, nvme_info->nei_parent, NVME, 0,
	    fmri)) == NULL) {
		/* errno set */
		topo_mod_dprintf(mod, "%s: bind failed for %s=%" PRIu64
		    "/%s=0", __func__, pname, pinst, NVME);
		goto error;
	}
	nvme_info->nei_nvme = nvme;
	nvme_info->nei_nvme_fmri = fmri;

	/*
	 * If our parent node is a "pciexfn" node then this is a NVMe device on
	 * a PCIe AIC, so we inherit our parent's FRU.  Otherwise, we set the
	 * FRU to ourself.
	 */
	if (strcmp(pname, PCIEX_FUNCTION) == 0)
		fru = NULL;
	else
		fru = fmri;

	if (topo_node_fru_set(nvme, fru, 0, &err) != 0) {
		topo_mod_dprintf(mod, "%s: failed to set FRU: %s", __func__,
		    topo_strerror(err));
		(void) topo_mod_seterrno(mod, err);
		goto error;
	}

	/*
	 * Clone the label from our parent node.  We can't inherit the property
	 * because the label prop is mutable on bay nodes and only immutable
	 * properties can be inherited.
	 */
	if ((topo_node_label(nvme_info->nei_parent, &label, &err) != 0 &&
	    err != ETOPO_PROP_NOENT) ||
	    topo_node_label_set(nvme, label, &err) != 0) {
		topo_mod_dprintf(mod, "%s: failed to set label: %s",
		    __func__, topo_strerror(err));
		(void) topo_mod_seterrno(mod, err);
		goto error;
	}

	/*
	 * Ensure that we have a UFM property set based on our devinfo path.
	 * This is a little repetitive if our parent actually did so as well,
	 * but given that the majority of such nodes are under bays and slots
	 * right now, it's a worthwhile tradeoff.
	 */
	disk_nvme_make_ufm(mod, nvme_info);

	if (topo_pgroup_create(nvme, &nvme_pgroup, &err) != 0) {
		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
		    __func__, TOPO_PGROUP_NVME, topo_strerror(err));
		(void) topo_mod_seterrno(mod, err);
		goto error;
	}

	if (asprintf(&vers, "%u.%u", nvme_info->nei_vers->v_major,
	    nvme_info->nei_vers->v_minor) < 0) {
		topo_mod_dprintf(mod, "%s: failed to alloc string", __func__);
		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
		goto error;
	}
	if (topo_prop_set_string(nvme, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER,
	    TOPO_PROP_IMMUTABLE, vers, &err) != 0) {
		topo_mod_dprintf(mod, "%s: failed to set %s/%s property: %s",
		    __func__, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER,
		    topo_strerror(err));
		(void) topo_mod_seterrno(mod, err);
		goto error;
	}

	if (topo_pgroup_create(nvme, &io_pgroup, &err) != 0) {
		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
		    __func__, TOPO_PGROUP_IO, topo_strerror(err));
		(void) topo_mod_seterrno(mod, err);
		goto error;
	}

	if (!disk_nvme_common_io(mod, nvme, nvme_info->nei_dinode)) {
		goto error;
	}

	/*
	 * Create a child disk node for each namespace.
	 */
	if (topo_node_range_create(mod, nvme, DISK, 0,
	    nvme_ctrl_info_nns(info) - 1) < 0) {
		/* errno set */
		topo_mod_dprintf(mod, "%s: error creating %s range", __func__,
		    DISK);
		goto error;
	}

	/*
	 * Iterate over each namespace to see if it's a candidate for inclusion.
	 * Namespaces start at index 1 and not every namespace will be included.
	 * We map things such that a disk instance is always namespace - 1 to
	 * fit into the above mapping.
	 */
	if (!nvme_ns_discover_init(nvme_info->nei_ctrl,
	    NVME_NS_DISC_F_NOT_IGNORED, &iter)) {
		topo_mod_dprintf(mod, "failed to initialize namespace "
		    "discovery: %s", nvme_errmsg(nvme_info->nei_libnvme));
		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
		goto error;
	}

	for (nret = nvme_ns_discover_step(iter, &disc); nret == NVME_ITER_VALID;
	    nret = nvme_ns_discover_step(iter, &disc)) {
		nvme_ns_info_t *ns_info;
		uint32_t nsid = nvme_ns_disc_nsid(disc);

		if (!nvme_ctrl_ns_info_snap(nvme_info->nei_ctrl, nsid,
		    &ns_info)) {
			topo_mod_dprintf(mod, "failed to get namespace "
			    "information for ns %u: %s", nsid,
			    nvme_errmsg(nvme_info->nei_libnvme));
			ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
			goto error;
		}

		disk_nvme_make_ns(nvme_info, ns_info);
		nvme_ns_info_free(ns_info);
	}

	/*
	 * The loop ends either because discovery is complete or because it
	 * failed. The failure must be propagated rather than being papered
	 * over by the success assignment below.
	 */
	if (nret == NVME_ITER_ERROR) {
		topo_mod_dprintf(mod, "namespace discovery failed: %s",
		    nvme_errmsg(nvme_info->nei_libnvme));
		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
		goto error;
	}
	ret = 0;

error:
	/*
	 * This is the common exit path for both success and failure. The FMRI
	 * is about to be freed, so don't leave a stale pointer to it behind in
	 * the caller's structure.
	 */
	nvme_info->nei_nvme_fmri = NULL;
	nvme_ns_discover_fini(iter);
	free(vers);
	nvlist_free(auth);
	nvlist_free(fmri);
	topo_mod_strfree(mod, model);
	topo_mod_strfree(mod, serial);
	topo_mod_strfree(mod, label);
	return (ret);
}

/*
 * This function gathers identity information from the NVMe controller and
 * stores it in a struct.  This struct is passed to make_nvme_node(), which
 * does the actual topo node creation.
 *
 * The controller handle and information snapshot taken here are released
 * again before returning.  Returns the result of make_nvme_node(), or -1 with
 * the module errno set to EMOD_UNKNOWN if the controller could not be opened
 * or snapshotted.
 */
static int
discover_nvme_ctl(topo_mod_t *mod, tnode_t *pnode, di_node_t dinode)
{
	topo_disk_t *disk = topo_mod_getspecific(mod);
	nvme_enum_info_t nvme_info = { 0 };
	int ret;

	nvme_info.nei_mod = mod;
	nvme_info.nei_dinode = dinode;
	nvme_info.nei_parent = pnode;
	nvme_info.nei_libnvme = disk->td_nvme;

	if (!nvme_ctrl_init(disk->td_nvme, dinode, &nvme_info.nei_ctrl)) {
		topo_mod_dprintf(mod, "failed to initialize nvme_ctrl_t: %s",
		    nvme_errmsg(disk->td_nvme));
		return (topo_mod_seterrno(mod, EMOD_UNKNOWN));
	}

	if (!nvme_ctrl_info_snap(nvme_info.nei_ctrl,
	    &nvme_info.nei_ctrl_info)) {
		topo_mod_dprintf(mod, "failed to take nvme_ctrl_info_t "
		    "snapshot: %s", nvme_errmsg(disk->td_nvme));
		ret = topo_mod_seterrno(mod, EMOD_UNKNOWN);
		goto done;
	}

	nvme_info.nei_vers = nvme_ctrl_info_version(nvme_info.nei_ctrl_info);

	ret = make_nvme_node(&nvme_info);

done:
	/* Shared by the success and failure paths. */
	if (nvme_info.nei_ctrl_info != NULL)
		nvme_ctrl_info_free(nvme_info.nei_ctrl_info);
	if (nvme_info.nei_ctrl != NULL)
		nvme_ctrl_fini(nvme_info.nei_ctrl);
	return (ret);
}

/*
 * Enumerate the NVMe controller, if any, that sits beneath the PCIe device
 * named by the parent node's TOPO_BINDING_PARENT_DEV property.
 *
 * Returns the result of discover_nvme_ctl() for the first matching
 * controller, 0 if no controller matched, and -1 if the binding property, the
 * devinfo snapshot or a devfs path could not be obtained.
 */
int
disk_nvme_enum_disk(topo_mod_t *mod, tnode_t *pnode)
{
	char *bind_path = NULL;
	di_node_t root, ctrl;
	int rv = -1;
	int err;

	/*
	 * The devfs path of the controller's parent PCIe device is supplied
	 * by the per-platform topo XML map delivered with the OS.  Having it
	 * hard-coded there is what lets topo tie a given NVMe controller to a
	 * physical bay or slot when the snapshot is generated.
	 */
	if (topo_prop_get_string(pnode, TOPO_PGROUP_BINDING,
	    TOPO_BINDING_PARENT_DEV, &bind_path, &err) != 0) {
		topo_mod_dprintf(mod, "parent node was missing nvme binding "
		    "properties\n");
		(void) topo_mod_seterrno(mod, err);
		goto out;
	}

	root = topo_mod_devinfo(mod);
	if (root == DI_NODE_NIL) {
		topo_mod_dprintf(mod, "failed to get devinfo snapshot");
		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
		goto out;
	}

	/*
	 * Visit every devinfo node bound to the NVMe driver and compare the
	 * devfs path of its parent against the one from the binding property.
	 * Enumeration stops at the first match.
	 */
	for (ctrl = di_drv_first_node(NVME_DRV, root); ctrl != DI_NODE_NIL;
	    ctrl = di_drv_next_node(ctrl)) {
		char *ppath = di_devfs_path(di_parent_node(ctrl));

		if (ppath == NULL) {
			topo_mod_dprintf(mod, "failed to get dev path");
			(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
			goto out;
		}

		if (strcmp(bind_path, ppath) != 0) {
			di_devfs_path_free(ppath);
			continue;
		}

		rv = discover_nvme_ctl(mod, pnode, ctrl);
		di_devfs_path_free(ppath);
		goto out;
	}

	/* No controller lives under the bound device; that's not an error. */
	rv = 0;

out:
	topo_mod_strfree(mod, bind_path);
	return (rv);
}
